GroupBy¶
In [1]:
import pandas as pd
# create a dataframe of car models by two companies
df = pd.DataFrame({
'Company': ['A', 'A', 'A', 'B', 'B', 'B', 'B'],
'Model': ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4'],
'Year': [2019, 2020, 2021, 2018, 2019, 2020, 2021],
'Transmission': ['Manual', 'Automatic', 'Automatic',
'Manual', 'Automatic', 'Automatic', 'Manual'],
'EngineSize': [1.4, 2.0, 1.4, 1.5, 2.0, 1.5, 1.5],
'MPG': [55.4, 67.3, 58.9, 52.3, 64.2, 68.9, 83.1]
})
# display the dataframe
df
Out[1]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 0 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 2 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 3 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 4 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 5 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 6 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
In [4]:
# groupby columns on Col1 and estimate the
# maximum value of column Col2 for each group
# df.groupby([Col1])[Col2].max()
df.groupby(["Company"])["MPG"].max()
Out[4]:
Company A 67.3 B 83.1 Name: MPG, dtype: float64
In [5]:
# alternatively, you can pass 'max' to the agg() function
df.groupby(["Company"])["MPG"].agg('max')
Out[5]:
Company A 67.3 B 83.1 Name: MPG, dtype: float64
In [10]:
df.groupby(["Company"])["MPG"].agg(['max','mean','count','std'])
Out[10]:
| max | mean | count | std | |
|---|---|---|---|---|
| Company | ||||
| A | 67.3 | 60.533333 | 3 | 6.115826 |
| B | 83.1 | 67.125000 | 4 | 12.736921 |
| C | 70.0 | 70.000000 | 1 | NaN |
Add new row¶
Use pd.concat([df, new_df], ignore_index=True) to add new row of data.¶
In [7]:
# DO NOT USE df.append()
df.append(pd.DataFrame({
"Company":["C"],
"Model":["C1"],
"Year":[2023],
"Transmission":["Automatic"],
"EngineSize":[1.8] ,
"MPG":[70],
}))
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[7], line 2 1 # DO NOT USE df.append() ----> 2 df.append(pd.DataFrame({ 3 "Company":["C"], 4 "Model":["C1"], 5 "Year":[2023], 6 "Transmission":["Automatic"], 7 "EngineSize":[1.8] , 8 "MPG":[70], 9 })) File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:6204, in NDFrame.__getattr__(self, name) 6197 if ( 6198 name not in self._internal_names_set 6199 and name not in self._metadata 6200 and name not in self._accessors 6201 and self._info_axis._can_hold_identifiers_and_holds_name(name) 6202 ): 6203 return self[name] -> 6204 return object.__getattribute__(self, name) AttributeError: 'DataFrame' object has no attribute 'append'
In [8]:
df = pd.concat([df, pd.DataFrame({
"Company":["C"],
"Model":["C1"],
"Year":[2023],
"Transmission":["Automatic"],
"EngineSize":[1.8] ,
"MPG":[70],
})
], ignore_index=True)
In [9]:
df
Out[9]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 0 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 2 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 3 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 4 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 5 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 6 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
| 7 | C | C1 | 2023 | Automatic | 1.8 | 70.0 |
Access a row¶
In [8]:
df.iloc[5]
Out[8]:
Company B Model B3 Year 2020 Transmission Automatic EngineSize 1.5 MPG 68.9 Name: 5, dtype: object
df.sort_values¶
In [11]:
df.sort_values(by=["Year","MPG"], ascending=[True,False])
Out[11]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 3 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 4 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 0 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 5 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 6 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
| 2 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 7 | C | C1 | 2023 | Automatic | 1.8 | 70.0 |
In [10]:
df1 = df.sort_values(by="Year", ascending=True)
df1
Out[10]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 3 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 0 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 4 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 5 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 2 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 6 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
| 7 | C | C1 | 2023 | Automatic | 1.8 | 70.0 |
In [12]:
df1 = df.sort_values(by=["Company","Year"], ascending=[True,False])
df1
Out[12]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 2 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 0 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 6 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
| 5 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 4 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 3 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 7 | C | C1 | 2023 | Automatic | 1.8 | 70.0 |
df.set_index¶
In [13]:
df.set_index('Model')
Out[13]:
| Company | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|
| Model | |||||
| A1 | A | 2019 | Manual | 1.4 | 55.4 |
| A2 | A | 2020 | Automatic | 2.0 | 67.3 |
| A3 | A | 2021 | Automatic | 1.4 | 58.9 |
| B1 | B | 2018 | Manual | 1.5 | 52.3 |
| B2 | B | 2019 | Automatic | 2.0 | 64.2 |
| B3 | B | 2020 | Automatic | 1.5 | 68.9 |
| B4 | B | 2021 | Manual | 1.5 | 83.1 |
| C1 | C | 2023 | Automatic | 1.8 | 70.0 |
df.reset_index¶
In [15]:
df1.reset_index(drop=True)
Out[15]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 0 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 2 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 3 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
| 4 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 5 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 6 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 7 | C | C1 | 2023 | Automatic | 1.8 | 70.0 |
Alter column/series order¶
In [14]:
df
Out[14]:
| Company | Model | Year | Transmission | EngineSize | MPG | |
|---|---|---|---|---|---|---|
| 0 | A | A1 | 2019 | Manual | 1.4 | 55.4 |
| 1 | A | A2 | 2020 | Automatic | 2.0 | 67.3 |
| 2 | A | A3 | 2021 | Automatic | 1.4 | 58.9 |
| 3 | B | B1 | 2018 | Manual | 1.5 | 52.3 |
| 4 | B | B2 | 2019 | Automatic | 2.0 | 64.2 |
| 5 | B | B3 | 2020 | Automatic | 1.5 | 68.9 |
| 6 | B | B4 | 2021 | Manual | 1.5 | 83.1 |
| 7 | C | C1 | 2023 | Automatic | 1.8 | 70.0 |
In [15]:
df[["MPG","EngineSize","Transmission","Company","Model","Year"]]
Out[15]:
| MPG | EngineSize | Transmission | Company | Model | Year | |
|---|---|---|---|---|---|---|
| 0 | 55.4 | 1.4 | Manual | A | A1 | 2019 |
| 1 | 67.3 | 2.0 | Automatic | A | A2 | 2020 |
| 2 | 58.9 | 1.4 | Automatic | A | A3 | 2021 |
| 3 | 52.3 | 1.5 | Manual | B | B1 | 2018 |
| 4 | 64.2 | 2.0 | Automatic | B | B2 | 2019 |
| 5 | 68.9 | 1.5 | Automatic | B | B3 | 2020 |
| 6 | 83.1 | 1.5 | Manual | B | B4 | 2021 |
| 7 | 70.0 | 1.8 | Automatic | C | C1 | 2023 |
In [16]:
df.iloc[:,[5,4,3,0,1,2]]
Out[16]:
| MPG | EngineSize | Transmission | Company | Model | Year | |
|---|---|---|---|---|---|---|
| 0 | 55.4 | 1.4 | Manual | A | A1 | 2019 |
| 1 | 67.3 | 2.0 | Automatic | A | A2 | 2020 |
| 2 | 58.9 | 1.4 | Automatic | A | A3 | 2021 |
| 3 | 52.3 | 1.5 | Manual | B | B1 | 2018 |
| 4 | 64.2 | 2.0 | Automatic | B | B2 | 2019 |
| 5 | 68.9 | 1.5 | Automatic | B | B3 | 2020 |
| 6 | 83.1 | 1.5 | Manual | B | B4 | 2021 |
| 7 | 70.0 | 1.8 | Automatic | C | C1 | 2023 |
Series.unique¶
In [17]:
df["EngineSize"].unique()
Out[17]:
array([1.4, 2. , 1.5, 1.8])
In [27]:
df["Company"].unique().tolist()
Out[27]:
['A', 'B', 'C']
In [19]:
df["Year"].unique()
Out[19]:
array([2019, 2020, 2021, 2018, 2023])
Series.nunique¶
In [20]:
df["EngineSize"].nunique()
Out[20]:
4
In [21]:
df["Year"].nunique()
Out[21]:
5
In [22]:
df["Transmission"].nunique()
Out[22]:
2
Return a Series containing counts of unique values.¶
In [20]:
df["Year"].value_counts()
Out[20]:
Year 2019 2 2020 2 2021 2 2018 1 2023 1 Name: count, dtype: int64
pd.read_json¶
In [47]:
link = "https://api.data.gov.hk/v2/filter?q=%7B%22resource%22%3A%22http%3A%2F%2Fwww.rvd.gov.hk%2Fdoc%2Fen%2Fstatistics%2Fhis_data_16.xls%22%2C%22section%22%3A1%2C%22format%22%3A%22json%22%7D"
data = pd.read_json(link)
In [48]:
data.columns
Out[48]:
Index(['年 Year', '月 Month', '一手買賣 數目 Primary Sales No.',
'一手買賣 總值 (百萬元) Primary Sales Consideration ($ million)',
'二手買賣 數目 Secondary Sales No.',
'二手買賣 總值 (百萬元) Secondary Sales Consideration ($ million)'],
dtype='object')
In [53]:
# change the column name
data = data.rename(columns = {'年 Year':'Year',
'月 Month':'Month',
'一手買賣 數目 Primary Sales No.':'FirstHandSales',
'一手買賣 總值 (百萬元) Primary Sales Consideration ($ million)':
'FirstConsideration',
'二手買賣 數目 Secondary Sales No.':'SecondHandSales',
'二手買賣 總值 (百萬元) Secondary Sales Consideration ($ million)':
'SecondConsideration'
})
In [56]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 266 entries, 0 to 265 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 266 non-null int64 1 Month 266 non-null int64 2 FirstHandSales 266 non-null int64 3 FirstConsideration 266 non-null int64 4 SecondHandSales 266 non-null int64 5 SecondConsideration 266 non-null int64 dtypes: int64(6) memory usage: 12.6 KB
In [57]:
# join Year and Month as new column
data["DateTime"] = data["Year"].astype(str) + "-" + data["Month"].astype(str)
# drop Year and Month
data = data.drop(['Year','Month'], axis=1)
# set DateTime as index
data = data.set_index('DateTime')
data
Out[57]:
| FirstHandSales | FirstConsideration | SecondHandSales | SecondConsideration | |
|---|---|---|---|---|
| DateTime | ||||
| 2002-1 | 2718 | 8218 | 5129 | 9966 |
| 2002-2 | 1506 | 4305 | 4189 | 8070 |
| 2002-3 | 1709 | 4190 | 3741 | 6795 |
| 2002-4 | 2543 | 6516 | 4866 | 8739 |
| 2002-5 | 2257 | 5844 | 5068 | 11097 |
| ... | ... | ... | ... | ... |
| 2023-10 | 356 | 4805 | 1767 | 19693 |
| 2023-11 | 547 | 6318 | 2007 | 13352 |
| 2023-12 | 959 | 9684 | 1970 | 14640 |
| 2024-1 | 1003 | 8336 | 2474 | 19456 |
| 2024-2 | 367 | 5677 | 2008 | 13419 |
266 rows × 4 columns
In [58]:
data.columns
Out[58]:
Index(['FirstHandSales', 'FirstConsideration', 'SecondHandSales',
'SecondConsideration'],
dtype='object')
In [60]:
# pip install plotly
Requirement already satisfied: plotly in c:\users\user\anaconda3\lib\site-packages (5.19.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\user\anaconda3\lib\site-packages (from plotly) (8.2.2) Requirement already satisfied: packaging in c:\users\user\anaconda3\lib\site-packages (from plotly) (23.1) Note: you may need to restart the kernel to use updated packages.
In [59]:
import plotly.express as px
fig = px.line(data, x=data.index, y=["FirstHandSales","FirstConsideration"],
title='First Hand Sales')
fig.show()
In [66]:
data.corr()
Out[66]:
| FirstHandSales | FirstConsideration | SecondHandSales | SecondConsideration | |
|---|---|---|---|---|
| FirstHandSales | 1.000000 | 0.540893 | 0.131269 | 0.046023 |
| FirstConsideration | 0.540893 | 1.000000 | -0.014293 | 0.554104 |
| SecondHandSales | 0.131269 | -0.014293 | 1.000000 | 0.491342 |
| SecondConsideration | 0.046023 | 0.554104 | 0.491342 | 1.000000 |
In [67]:
data['FirstHandSales'].corr(data['SecondHandSales'])
Out[67]:
0.13126927888869652
In [64]:
fig = px.imshow(data, text_auto=True)
fig.show()
In [70]:
data#.T.T
Out[70]:
| FirstHandSales | FirstConsideration | SecondHandSales | SecondConsideration | |
|---|---|---|---|---|
| DateTime | ||||
| 2002-1 | 2718 | 8218 | 5129 | 9966 |
| 2002-2 | 1506 | 4305 | 4189 | 8070 |
| 2002-3 | 1709 | 4190 | 3741 | 6795 |
| 2002-4 | 2543 | 6516 | 4866 | 8739 |
| 2002-5 | 2257 | 5844 | 5068 | 11097 |
| ... | ... | ... | ... | ... |
| 2023-10 | 356 | 4805 | 1767 | 19693 |
| 2023-11 | 547 | 6318 | 2007 | 13352 |
| 2023-12 | 959 | 9684 | 1970 | 14640 |
| 2024-1 | 1003 | 8336 | 2474 | 19456 |
| 2024-2 | 367 | 5677 | 2008 | 13419 |
266 rows × 4 columns
In [75]:
data.to_numpy()
Out[75]:
array([[ 2718, 8218, 5129, 9966],
[ 1506, 4305, 4189, 8070],
[ 1709, 4190, 3741, 6795],
...,
[ 959, 9684, 1970, 14640],
[ 1003, 8336, 2474, 19456],
[ 367, 5677, 2008, 13419]], dtype=int64)
In [77]:
data["FirstHandSales"].to_numpy()
Out[77]:
array([2718, 1506, 1709, 2543, 2257, 2087, 755, 796, 3278, 2282, 1340,
1817, 2291, 966, 1655, 2431, 1257, 1602, 2765, 2766, 1867, 3958,
2602, 2338, 2894, 3734, 3705, 1379, 961, 2100, 2421, 1099, 1558,
1801, 2576, 1466, 1056, 2092, 993, 918, 2471, 2028, 1150, 569,
524, 2762, 1228, 203, 148, 251, 895, 781, 1137, 1105, 552,
1421, 3929, 1165, 953, 1649, 994, 1267, 834, 2410, 2293, 1173,
1402, 1886, 2434, 1901, 2573, 956, 1702, 690, 2399, 540, 638,
2058, 457, 216, 917, 403, 131, 895, 507, 232, 863, 1587,
2525, 2717, 1307, 2301, 1532, 627, 753, 1210, 975, 564, 2028,
1975, 685, 1318, 1137, 1531, 556, 1656, 747, 474, 296, 486,
1123, 741, 2054, 1050, 608, 884, 231, 822, 1189, 1396, 879,
581, 1435, 804, 1092, 827, 1670, 1397, 644, 2264, 1066, 309,
632, 1197, 1095, 1045, 1328, 135, 211, 546, 871, 773, 1151,
2062, 1760, 1142, 693, 1136, 841, 1375, 2507, 1606, 1182, 1723,
1119, 1773, 1465, 1540, 738, 1634, 1808, 1547, 1122, 764, 1602,
1404, 1049, 2153, 450, 234, 641, 1294, 1476, 1538, 1142, 1628,
3474, 2251, 2212, 453, 664, 1474, 1435, 2650, 1275, 2320, 952,
1236, 2389, 1541, 1416, 1293, 828, 1168, 738, 2122, 756, 2070,
1753, 1490, 1252, 2337, 720, 399, 2191, 1654, 1468, 2789, 3246,
1191, 1810, 1252, 1338, 1387, 2112, 670, 626, 1016, 619, 775,
1179, 2195, 1673, 1068, 808, 1046, 2201, 2111, 586, 1553, 1553,
1067, 1572, 1954, 2052, 708, 1926, 1297, 1868, 1514, 1081, 481,
161, 258, 1492, 1382, 1177, 1564, 1435, 672, 365, 247, 363,
655, 1787, 1550, 1000, 1019, 810, 742, 964, 356, 547, 959,
1003, 367], dtype=int64)
Time Series¶
In [79]:
tnx = pd.read_csv("^TNX.csv")
tnx.tail(5)
Out[79]:
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| 257 | 2024-07-24 | 4.229 | 4.292 | 4.210 | 4.286 | 4.286 | 0.0 |
| 258 | 2024-07-25 | 4.206 | 4.266 | 4.196 | 4.256 | 4.256 | 0.0 |
| 259 | 2024-07-26 | 4.245 | 4.252 | 4.192 | 4.200 | 4.200 | 0.0 |
| 260 | 2024-07-29 | 4.163 | 4.186 | 4.151 | 4.178 | 4.178 | 0.0 |
| 261 | 2024-07-30 | 4.174 | 4.184 | 4.130 | 4.143 | 4.143 | 0.0 |
In [80]:
tnx.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 262 entries, 0 to 261 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 262 non-null object 1 Open 252 non-null float64 2 High 252 non-null float64 3 Low 252 non-null float64 4 Close 252 non-null float64 5 Adj Close 252 non-null float64 6 Volume 252 non-null float64 dtypes: float64(6), object(1) memory usage: 14.5+ KB
In [82]:
tnx['Date'] = pd.to_datetime(tnx['Date'],format="%Y-%m-%d")
tnx.Date
Out[82]:
0 2023-07-31
1 2023-08-01
2 2023-08-02
3 2023-08-03
4 2023-08-04
...
257 2024-07-24
258 2024-07-25
259 2024-07-26
260 2024-07-29
261 2024-07-30
Name: Date, Length: 262, dtype: datetime64[ns]
In [83]:
tnx = tnx.set_index('Date')
tnx.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 262 entries, 2023-07-31 to 2024-07-30 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Open 252 non-null float64 1 High 252 non-null float64 2 Low 252 non-null float64 3 Close 252 non-null float64 4 Adj Close 252 non-null float64 5 Volume 252 non-null float64 dtypes: float64(6) memory usage: 14.3 KB
In [84]:
tnx.tail(3)
Out[84]:
| Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2024-07-26 | 4.245 | 4.252 | 4.192 | 4.200 | 4.200 | 0.0 |
| 2024-07-29 | 4.163 | 4.186 | 4.151 | 4.178 | 4.178 | 0.0 |
| 2024-07-30 | 4.174 | 4.184 | 4.130 | 4.143 | 4.143 | 0.0 |
In [85]:
tnx.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 262 entries, 2023-07-31 to 2024-07-30 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Open 252 non-null float64 1 High 252 non-null float64 2 Low 252 non-null float64 3 Close 252 non-null float64 4 Adj Close 252 non-null float64 5 Volume 252 non-null float64 dtypes: float64(6) memory usage: 14.3 KB
In [106]:
df_dt = pd.DataFrame({'year': [2023, 2023],
'month': [1, 1],
'day': [1, 2],
'sales': [3750, 3900]})
df_dt
Out[106]:
| year | month | day | sales | |
|---|---|---|---|---|
| 0 | 2023 | 1 | 1 | 3750 |
| 1 | 2023 | 1 | 2 | 3900 |
In [107]:
df_dt['date'] = pd.to_datetime(df_dt[['year','month','day']])
df_dt = df_dt.drop(['year','month','day'], axis=1)
df_dt = df_dt.set_index('date')
df_dt
Out[107]:
| sales | |
|---|---|
| date | |
| 2023-01-01 | 3750 |
| 2023-01-02 | 3900 |
In [111]:
df_dt
Out[111]:
| sales | |
|---|---|
| date | |
| 2023-01-01 | 3750 |
| 2023-01-02 | 3900 |
In [112]:
df_dt.iloc[0]
Out[112]:
sales 3750 Name: 2023-01-01 00:00:00, dtype: int64
In [40]:
df_dt.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 2 entries, 2023-01-01 to 2023-01-02 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sales 2 non-null int64 dtypes: int64(1) memory usage: 32.0 bytes
In [41]:
pd.to_datetime(1681038343, unit='s')
Out[41]:
Timestamp('2023-04-09 11:05:43')
In [42]:
pd.to_datetime(1681038343433502912, unit='ns')
Out[42]:
Timestamp('2023-04-09 11:05:43.433502912')
Categorical data¶
In [2]:
animal = pd.Series(["Bird","Cat", "Dog","Elephant"], dtype="category")
In [3]:
animal.info()
<class 'pandas.core.series.Series'> RangeIndex: 4 entries, 0 to 3 Series name: None Non-Null Count Dtype -------------- ----- 4 non-null category dtypes: category(1) memory usage: 340.0 bytes
In [4]:
import numpy as np
df_exam = pd.DataFrame({'Score': np.random.randint(1, 101, size=40)})
In [5]:
df_exam.head(6)
Out[5]:
| Score | |
|---|---|
| 0 | 40 |
| 1 | 22 |
| 2 | 27 |
| 3 | 78 |
| 4 | 61 |
| 5 | 20 |
In [6]:
df_exam['Result'] = pd.Series(np.where(df_exam['Score']>70, 'Distiction',
np.where(df_exam['Score']>50, 'Pass', 'Fail')
)).astype('category')
In [9]:
df_exam.sample(5)
Out[9]:
| Score | Result | |
|---|---|---|
| 7 | 95 | Distiction |
| 0 | 40 | Fail |
| 27 | 70 | Pass |
| 22 | 38 | Fail |
| 38 | 85 | Distiction |
In [8]:
df_exam.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 40 entries, 0 to 39 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Score 40 non-null int32 1 Result 40 non-null category dtypes: category(1), int32(1) memory usage: 464.0 bytes
In [ ]:
In [ ]: